Introduction

In genomic selection and quantitative genetics, researchers often have access to genomic relationship matrices (GRMs) calculated from different marker sets across overlapping but not identical sets of individuals. This vignette demonstrates how to use CovCombR to combine multiple GRMs estimated from different populations with:

We will simulate 10 different GRMs, each calculated on a subset of individuals from a larger population, using different sets of markers with population-specific allele frequencies.

Simulation Design

Population Structure

We create a realistic scenario where:

  1. A large population of \(n = 100\) genotypes exists with realistic population structure
  2. We have 10 different studies/marker panels, each genotyping overlapping subsets of these 100 individuals
  3. Each study uses a different set of markers (with some potential overlap)
  4. Each study has slightly different allele frequencies due to population substructure

Van Raden GRM

For each population and marker set, we calculate the Van Raden (2008) GRM:

\[ \mathbf{G} = \frac{\mathbf{Z}\mathbf{Z}'}{\text{tr}(\mathbf{Z}\mathbf{Z}')/n} \]

where \(\mathbf{Z}\) is the centered and scaled marker matrix.

Data Generation

Step 1: Generate Base Population Structure

First, we create a realistic population structure by simulating a base relationship matrix with population substructure. We’ll create 3 subpopulations with different levels of relatedness.

# Simulation parameters
n_total <- 1000 # Total genotypes
n_markers_study <- 5000 # Total markers per study
min_genotypes_per_study <- 200 # Minimum genotypes sampled per study
max_genotypes_per_study <- 400 # Maximum genotypes sampled per study
n_markers_per_study <- n_markers_study # Alias for inline code
n_studies <- 10

# We'll generate a smaller set of base markers to save time/code for the vignette
n_markers_base_sim <- 10000
n_markers_total <- n_markers_base_sim # Alias for total markers used in inline code

# Generate base population structure (3 subpopulations)
subpop_sizes <- c(400, 350, 250) # Scaled up to sum to 1000
subpop_labels <- rep(1:3, times = subpop_sizes)
genotype_ids <- paste0("G", sprintf("%03d", 1:n_total))

# Generate true correlation matrix with block structure
true_cor <- matrix(0.1, n_total, n_total)
for (i in 1:3) {
  idx <- which(subpop_labels == i)
  true_cor[idx, idx] <- 0.6
}
diag(true_cor) <- 1
# Ensure PD
eig <- eigen(true_cor, symmetric = TRUE)
true_cor <- eig$vectors %*% diag(pmax(eig$values, 0.01)) %*% t(eig$vectors)
true_cor <- cov2cor(true_cor)
dimnames(true_cor) <- list(genotype_ids, genotype_ids)

# Generate marker data consistent with this structure
L <- chol(true_cor)
marker_mat <- matrix(0, n_total, n_markers_base_sim)
marker_freqs <- runif(n_markers_base_sim, 0.1, 0.9)

# Helper for VanRaden GRM
calc_grm <- function(M, ids = NULL) {
  p <- colMeans(M) / 2
  Z <- M - matrix(2 * p, nrow(M), ncol(M), byrow = TRUE)
  G <- tcrossprod(Z) / (sum(diag(tcrossprod(Z))) / nrow(M))
  if (!is.null(ids)) dimnames(G) <- list(ids, ids)
  G
}

# Generate markers (simplified for vignette)
# Using a subset of markers for speed in this example
for (m in 1:n_markers_base_sim) {
  z <- crossprod(L, rnorm(n_total))
  p <- marker_freqs[m]
  thresh1 <- qnorm((1 - p)^2)
  thresh2 <- qnorm((1 - p)^2 + 2 * p * (1 - p))
  marker_mat[, m] <- ifelse(z < thresh1, 0, ifelse(z < thresh2, 1, 2))
}
rownames(marker_mat) <- genotype_ids
colnames(marker_mat) <- paste0("M", 1:n_markers_base_sim)

true_grm <- calc_grm(marker_mat, genotype_ids)

# Generate Study GRMs with overlap
study_grms <- list()
study_genotypes <- list()
study_markers <- list()

set.seed(2025)
for (k in 1:n_studies) {
  # Sample genotypes with overlap
  # Higher probability for already sampled individuals to ensure connectivity
  if (k == 1) {
    prob <- rep(1, n_total)
  } else {
    sampled <- unique(unlist(study_genotypes))
    prob <- rep(1, n_total)
    prob[which(genotype_ids %in% genotype_ids[sampled])] <- 3 # 3x more likely to be resampled
  }

  n_s <- sample(200:400, 1)
  s_idx <- sort(sample(1:n_total, n_s, prob = prob))
  study_genotypes[[k]] <- s_idx

  # Sample markers
  m_idx <- sort(sample(1:n_markers_base_sim, n_markers_study))
  study_markers[[k]] <- m_idx

  # Calculate GRM
  M_sub <- marker_mat[s_idx, m_idx]
  study_grms[[k]] <- calc_grm(M_sub, genotype_ids[s_idx])
}
names(study_grms) <- paste0("Study", 1:n_studies)

# Track marker counts for each study (needed for degrees of freedom)
study_marker_counts <- sapply(study_markers, length)

Step 5: Summary of Study Overlap

# Calculate pairwise overlap between studies (genotypes)
genotype_overlap_matrix <- matrix(0, n_studies, n_studies)
for (i in 1:n_studies) {
  for (j in 1:n_studies) {
    genotype_overlap_matrix[i, j] <- length(intersect(
      study_genotypes[[i]],
      study_genotypes[[j]]
    ))
  }
}

colnames(genotype_overlap_matrix) <- paste0("S", 1:n_studies)
rownames(genotype_overlap_matrix) <- paste0("S", 1:n_studies)

cat("\nGenotype overlap between studies:\n")
## 
## Genotype overlap between studies:
print(genotype_overlap_matrix)
##      S1  S2  S3  S4  S5  S6  S7  S8  S9 S10
## S1  340 159 187 181 153 151 117 122 162  93
## S2  159 278 162 147 125 121  94  85 122  75
## S3  187 162 375 186 174 160 116 121 168 106
## S4  181 147 186 385 179 154 125 133 176 103
## S5  153 125 174 179 372 160 123 130 167 107
## S6  151 121 160 154 160 362 123 131 159 106
## S7  117  94 116 125 123 123 270 105 120  76
## S8  122  85 121 133 130 131 105 293 125  96
## S9  162 122 168 176 167 159 120 125 393 114
## S10  93  75 106 103 107 106  76  96 114 271
# Calculate pairwise marker overlap between studies
marker_overlap_matrix <- matrix(0, n_studies, n_studies)
for (i in 1:n_studies) {
  for (j in 1:n_studies) {
    marker_overlap_matrix[i, j] <- length(intersect(
      study_markers[[i]],
      study_markers[[j]]
    ))
  }
}

colnames(marker_overlap_matrix) <- paste0("S", 1:n_studies)
rownames(marker_overlap_matrix) <- paste0("S", 1:n_studies)

cat("\nMarker overlap between studies:\n")
## 
## Marker overlap between studies:
print(marker_overlap_matrix)
##       S1   S2   S3   S4   S5   S6   S7   S8   S9  S10
## S1  5000 2492 2441 2517 2482 2519 2485 2472 2530 2495
## S2  2492 5000 2520 2507 2527 2487 2535 2479 2547 2463
## S3  2441 2520 5000 2520 2538 2432 2492 2516 2485 2478
## S4  2517 2507 2520 5000 2524 2467 2472 2483 2517 2500
## S5  2482 2527 2538 2524 5000 2493 2487 2490 2472 2463
## S6  2519 2487 2432 2467 2493 5000 2449 2473 2517 2472
## S7  2485 2535 2492 2472 2487 2449 5000 2481 2479 2501
## S8  2472 2479 2516 2483 2490 2473 2481 5000 2487 2533
## S9  2530 2547 2485 2517 2472 2517 2479 2487 5000 2439
## S10 2495 2463 2478 2500 2463 2472 2501 2533 2439 5000
cat("\nStudy coverage statistics:\n")
## 
## Study coverage statistics:
all_genotypes_covered <- unique(unlist(study_genotypes))
cat(sprintf(
  "  Genotypes: %d / %d covered (%.1f%%)\n",
  length(all_genotypes_covered), n_total,
  100 * length(all_genotypes_covered) / n_total
))
##   Genotypes: 860 / 1000 covered (86.0%)
genotype_frequency <- table(unlist(study_genotypes))
cat(sprintf(
  "  Mean studies per genotype: %.1f (range: %d-%d)\n",
  mean(genotype_frequency),
  min(genotype_frequency), max(genotype_frequency)
))
##   Mean studies per genotype: 3.9 (range: 1-9)
all_markers_used <- unique(unlist(study_markers))
cat(sprintf(
  "  Markers: %d / %d used (%.1f%%)\n",
  length(all_markers_used), n_markers_base_sim,
  100 * length(all_markers_used) / n_markers_base_sim
))
##   Markers: 9993 / 10000 used (99.9%)
marker_frequency <- table(unlist(study_markers))
cat(sprintf(
  "  Mean studies per marker: %.1f (range: %d-%d)\n",
  mean(marker_frequency),
  min(marker_frequency), max(marker_frequency)
))
##   Mean studies per marker: 5.0 (range: 1-10)

Comparison: True vs Sample GRMs

For genotypes that appear in multiple studies, we can compare the estimated relationships.

# Find genotypes that appear in at least 3 studies
# Convert study_genotypes to genotype IDs
all_study_genotype_ids <- lapply(study_genotypes, function(idx) genotype_ids[idx])
genotype_id_counts <- table(unlist(all_study_genotype_ids))
well_sampled_genotypes <- names(genotype_id_counts)[genotype_id_counts >= 3]

if (length(well_sampled_genotypes) >= 10) {
  # Select 10 well-sampled genotypes
  selected_genotypes <- sample(
    well_sampled_genotypes,
    min(10, length(well_sampled_genotypes))
  )

  # For each pair, collect estimates from different studies
  comparison_data <- list()

  for (i in 1:(length(selected_genotypes) - 1)) {
    for (j in (i + 1):length(selected_genotypes)) {
      g1 <- selected_genotypes[i]
      g2 <- selected_genotypes[j]

      true_value <- true_grm[g1, g2]

      # Find studies that have both genotypes
      studies_with_pair <- which(sapply(all_study_genotype_ids, function(study_ids) {
        (g1 %in% study_ids) && (g2 %in% study_ids)
      }))

      if (length(studies_with_pair) > 0) {
        for (study in studies_with_pair) {
          study_value <- study_grms[[study]][g1, g2]
          comparison_data[[length(comparison_data) + 1]] <- data.frame(
            Genotype1 = g1,
            Genotype2 = g2,
            Study = study,
            True_Relationship = true_value,
            Estimated_Relationship = study_value,
            Error = study_value - true_value
          )
        }
      }
    }
  }

  comparison_df <- do.call(rbind, comparison_data)

  # Plot: True vs Estimated
  ggplot(comparison_df, aes(x = True_Relationship, y = Estimated_Relationship)) +
    geom_point(alpha = 0.6, size = 2) +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
    labs(
      title = "Sample GRM vs True GRM",
      subtitle = "Relationships between well-sampled genotype pairs",
      x = "True Relationship (Population GRM)",
      y = "Estimated Relationship (Study GRM)"
    ) +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5, face = "bold"))

  cat("\nComparison statistics:\n")
  cat(sprintf(
    "  Number of genotype pairs compared: %d\n",
    nrow(comparison_df)
  ))
  cat(sprintf("  RMSE: %.4f\n", sqrt(mean(comparison_df$Error^2))))
  cat(sprintf("  MAE: %.4f\n", mean(abs(comparison_df$Error))))
  cat(sprintf(
    "  Correlation: %.4f\n",
    cor(
      comparison_df$True_Relationship,
      comparison_df$Estimated_Relationship
    )
  ))
}
## 
## Comparison statistics:
##   Number of genotype pairs compared: 64
##   RMSE: 0.0148
##   MAE: 0.0122
##   Correlation: 0.9977

Aligned GRM Visualization

To better understand the coverage and quality of each sample GRM, we visualize all GRMs using the same ordering based on the full population GRM. Unobserved relationships are shown in gray.

# Create full population matrices for each study, with NA for unobserved pairs
study_grms_full <- lapply(1:n_studies, function(k) {
  # Create a matrix for the full population
  full_grm <- matrix(NA, n_total, n_total)
  rownames(full_grm) <- genotype_ids
  colnames(full_grm) <- genotype_ids

  # Fill in observed values
  study_idx <- study_genotypes[[k]]
  study_ids <- genotype_ids[study_idx]
  full_grm[study_ids, study_ids] <- study_grms[[k]]

  return(full_grm)
})

# Order genotypes by hierarchical clustering of the true GRM
# This will group similar genotypes together
hc <- hclust(as.dist(1 - true_grm), method = "average")
ordered_genotype_ids <- genotype_ids[hc$order]

# Helper function to plot aligned GRM heatmap
plot_aligned_grm <- function(grm_matrix, title = "GRM", ordered_ids,
                             show_observed_only = FALSE,
                             text_size = 8) {
  n <- nrow(grm_matrix)

  # Reorder matrix
  grm_ordered <- grm_matrix[ordered_ids, ordered_ids]

  # Convert to long format for ggplot
  grm_long <- expand.grid(
    Row = 1:n,
    Col = 1:n
  )
  grm_long$value <- as.vector(grm_ordered)
  grm_long$is_observed <- !is.na(grm_long$value)

  # For visualization, set NA to a specific value (will be colored gray)
  grm_long$value_plot <- ifelse(is.na(grm_long$value), -999, grm_long$value)

  # Create plot
  p <- ggplot(grm_long, aes(x = Col, y = n - Row + 1)) +
    geom_tile(aes(fill = value_plot, alpha = is_observed)) +
    scale_fill_gradient2(
      low = "#2166AC",
      mid = "#F7F7F7",
      high = "#B2182B",
      midpoint = 0.5,
      limits = c(-0.5, 2),
      name = "Relationship",
      na.value = "#808080"
    ) +
    scale_alpha_manual(
      values = c("TRUE" = 1, "FALSE" = 0.3),
      guide = "none"
    ) +
    labs(title = title, x = NULL, y = NULL) +
    theme_minimal(base_size = text_size) +
    theme(
      axis.text.x = element_blank(),
      axis.text.y = element_blank(),
      axis.ticks = element_blank(),
      panel.grid = element_blank(),
      plot.title = element_text(hjust = 0.5, face = "bold"),
      legend.position = "right",
      legend.key.height = unit(0.3, "cm"),
      legend.key.width = unit(0.15, "cm")
    ) +
    coord_equal()

  return(p)
}

# Create aligned heatmaps for all studies
aligned_study_plots <- lapply(1:n_studies, function(k) {
  n_obs <- length(study_genotypes[[k]])
  n_pairs_observed <- n_obs * (n_obs + 1) / 2
  coverage_pct <- 100 * n_pairs_observed / (n_total * (n_total + 1) / 2)

  plot_aligned_grm(
    study_grms_full[[k]],
    title = sprintf("Study %d\n(%.1f%% coverage)", k, coverage_pct),
    ordered_ids = ordered_genotype_ids,
    text_size = 8
  )
})

# Create aligned heatmap for true GRM
true_grm_aligned_plot <- plot_aligned_grm(
  true_grm,
  title = "True Population GRM\n(100% coverage)",
  ordered_ids = ordered_genotype_ids,
  text_size = 8
)

# Arrange all plots in a grid (1 true + n_studies)
all_aligned_plots <- c(list(true_grm_aligned_plot), aligned_study_plots)
n_plots_aligned <- length(all_aligned_plots)
n_cols_aligned <- 3
n_rows_aligned <- ceiling(n_plots_aligned / n_cols_aligned)
grid.arrange(grobs = all_aligned_plots, ncol = n_cols_aligned, nrow = n_rows_aligned)

Aligned heatmaps showing observed and missing entries for each study GRM alongside the true GRM.

This visualization shows:

The ordering reveals the population structure, and we can see which parts of the population structure each study captures.

# Calculate coverage by population structure blocks
# Define blocks based on subpopulation membership (after ordering)
subpop_labels_ordered <- subpop_labels[hc$order]

# For each study, calculate coverage within and between subpopulations
coverage_summary <- do.call(rbind, lapply(1:n_studies, function(k) {
  study_idx <- study_genotypes[[k]]
  study_subpops <- subpop_labels[study_idx]

  # Count coverage for each subpopulation pair
  coverage_data <- expand.grid(Pop1 = 1:3, Pop2 = 1:3)
  coverage_data$Coverage <- sapply(seq_len(nrow(coverage_data)), function(i) {
    pop1 <- coverage_data$Pop1[i]
    pop2 <- coverage_data$Pop2[i]

    # Genotypes in each subpopulation
    pop1_all <- which(subpop_labels == pop1)
    pop2_all <- which(subpop_labels == pop2)

    # Genotypes observed in study
    pop1_obs <- sum(study_idx %in% pop1_all)
    pop2_obs <- sum(study_idx %in% pop2_all)

    # Number of observed pairs
    if (pop1 == pop2) {
      n_pairs_total <- length(pop1_all) * (length(pop1_all) + 1) / 2
      n_pairs_obs <- pop1_obs * (pop1_obs + 1) / 2
    } else {
      n_pairs_total <- length(pop1_all) * length(pop2_all)
      n_pairs_obs <- pop1_obs * pop2_obs
    }

    100 * n_pairs_obs / n_pairs_total
  })

  coverage_data$Study <- paste0("Study ", k)
  return(coverage_data)
}))

# Plot coverage heatmap
ggplot(coverage_summary, aes(x = Pop2, y = Pop1, fill = Coverage)) +
  geom_tile(color = "white") +
  geom_text(aes(label = sprintf("%.0f%%", Coverage)), size = 2.5) +
  scale_fill_gradient(low = "white", high = "#2166AC", name = "Coverage (%)") +
  facet_wrap(~Study, ncol = 5) +
  scale_x_continuous(breaks = 1:3, labels = paste0("Pop", 1:3)) +
  scale_y_continuous(breaks = 1:3, labels = paste0("Pop", 1:3)) +
  labs(
    title = "Coverage of Population Structure Blocks by Study",
    x = "Population 2",
    y = "Population 1"
  ) +
  theme_minimal(base_size = 9) +
  theme(
    panel.grid = element_blank(),
    plot.title = element_text(hjust = 0.5, face = "bold"),
    strip.text = element_text(face = "bold")
  ) +
  coord_equal()

Heatmaps summarizing within- and between-population coverage percentages for every study.

This coverage analysis shows:

Summary

We have generated a realistic simulation scenario with:

The heatmaps above show: 1. The true population GRM (based on all 10^{4} markers and all 1000 genotypes) 2. Ten study-specific GRMs, each calculated on partial populations with partial marker information

Key features of this simulation: - Realistic sampling: Each study observes only a subset of individuals and markers - Population structure: Studies follow subpopulation structure but with intentional overlaps - Marker overlap: ~40% of markers overlap between consecutive studies - Genotype overlap: Individuals sampled multiple times have higher probability in later studies

The sample GRMs show realistic variation around the true relationships due to: - Finite marker sampling (each study uses ~15% of available markers) - Different marker sets across studies (with partial overlap) - Incomplete population sampling (each study observes ~40-70% of individuals) - Sampling variance in relationship estimation

Now we will use CovCombR to combine these incomplete GRMs and recover the full population relationship matrix.

Combining GRMs with CovCombR

Prepare Data for CovCombR

CovCombR expects covariance matrices scaled as Wishart sufficient statistics along with degrees of freedom. For GRMs:

  • Each sample GRM is already a covariance estimate
  • Degrees of freedom approximate the effective number of independent markers
  • We scale each GRM by its effective sample size
# For each study, prepare the Wishart-scale matrix and degrees of freedom
# The GRM is already an estimate of the covariance, so we need to scale it
# appropriately and provide degrees of freedom

# Effective degrees of freedom can be approximated by the number of markers
# For Van Raden GRM: nu approximately equals n_markers
# We'll use a slightly conservative estimate

S_list_wishart <- lapply(1:n_studies, function(k) {
  study_grm <- study_grms[[k]]

  # The GRM is already a covariance estimate on the data scale
  # fit_covcomb expects covariance matrices on the data scale (not per-df)
  # So we pass the GRM directly without any scaling
  S_k <- study_grm

  return(S_k)
})

names(S_list_wishart) <- paste0("Study", 1:n_studies)

# Degrees of freedom for each study
nu_vec <- sapply(1:n_studies, function(k) {
  round(study_marker_counts[k] * 0.8)
})
names(nu_vec) <- paste0("Study", 1:n_studies)

cat("Prepared data for CovCombR:\n")
## Prepared data for CovCombR:
for (k in 1:n_studies) {
  cat(sprintf(
    "  Study %2d: %d genotypes, nu = %d\n",
    k, nrow(study_grms[[k]]), nu_vec[k]
  ))
}
##   Study  1: 340 genotypes, nu = 4000
##   Study  2: 278 genotypes, nu = 4000
##   Study  3: 375 genotypes, nu = 4000
##   Study  4: 385 genotypes, nu = 4000
##   Study  5: 372 genotypes, nu = 4000
##   Study  6: 362 genotypes, nu = 4000
##   Study  7: 270 genotypes, nu = 4000
##   Study  8: 293 genotypes, nu = 4000
##   Study  9: 393 genotypes, nu = 4000
##   Study 10: 271 genotypes, nu = 4000

Fit CovCombR Model

Now we fit the CovCombR model to combine the incomplete GRMs. We use a small number of iterations for fast testing.

# Fit CovCombR with reduced iterations for testing
fit <- fit_covcomb(
  S_list = S_list_wishart,
  nu = nu_vec,
  init_sigma = "identity",
  scale_method = "none",
  control = list(
    max_iter = 50, # Small number for fast testing
    tol = 1e-5,
    ridge = 1e-6
  ),
  se_method = "none" # Skip SE computation for speed
)

print(fit)
## Wishart EM Covariance Combination
## Call: fit_covcomb(S_list = S_list_wishart, nu = nu_vec, scale_method = "none", 
##     init_sigma = "identity", control = list(max_iter = 50, tol = 1e-05, 
##         ridge = 1e-06), se_method = "none")
## 
## Status: did not converge in 50 iterations
## Final relative change: 1.91e-03
## 
## Combined covariance matrix (Sigma_hat): 860 x 860
##   Mean diagonal: 0.9070
##   Condition number: 5.58e+05
# plot log-likelihood convergence
plot(fit$history$log_likelihood,
  type = "o", xlab = "Iteration", ylab = "Log-Likelihood",
  main = "CovCombR Log-Likelihood Convergence"
)

Line plot of CovCombR log-likelihood across EM iterations for the simulated GRM example.

# Extract the combined GRM
combined_grm_raw <- fit$S_hat

# IMPORTANT: Rescale to match the true GRM scale
# Each study GRM was independently scaled to have mean diagonal = 1
# But the absolute variance scale differs due to different marker counts
# We need to rescale the combined GRM to match the true GRM's scale
#
# Strategy: Use the diagonal elements (which represent self-relationships)
# These should be close to 1.0 in a properly scaled GRM
# We'll rescale so that the mean diagonal matches the true GRM's mean diagonal
true_mean_diag <- mean(diag(true_grm))
combined_mean_diag <- mean(diag(combined_grm_raw))
scale_factor <- true_mean_diag / combined_mean_diag

cat("\nRescaling combined GRM:\n")
## 
## Rescaling combined GRM:
cat(sprintf("  True GRM mean diagonal: %.4f\n", true_mean_diag))
##   True GRM mean diagonal: 1.0000
cat(sprintf("  Combined GRM mean diagonal (before rescaling): %.4f\n", combined_mean_diag))
##   Combined GRM mean diagonal (before rescaling): 0.9070
cat(sprintf("  Scale factor applied: %.4f\n", scale_factor))
##   Scale factor applied: 1.1025
combined_grm_raw <- combined_grm_raw * scale_factor

cat(sprintf("  Combined GRM mean diagonal (after rescaling): %.4f\n", mean(diag(combined_grm_raw))))
##   Combined GRM mean diagonal (after rescaling): 1.0000
# Check which genotypes are in the combined GRM
combined_genotype_ids <- rownames(combined_grm_raw)

cat("\nCombined GRM coverage:\n")
## 
## Combined GRM coverage:
cat(sprintf(
  "  Genotypes in combined GRM: %d / %d\n",
  length(combined_genotype_ids), n_total
))
##   Genotypes in combined GRM: 860 / 1000
# Create full population matrix, filling in with NA for unobserved genotypes
combined_grm <- matrix(NA, n_total, n_total)
rownames(combined_grm) <- genotype_ids
colnames(combined_grm) <- genotype_ids

# Fill in observed genotypes
combined_grm[combined_genotype_ids, combined_genotype_ids] <- combined_grm_raw

cat("\nCombined GRM statistics (observed genotypes only):\n")
## 
## Combined GRM statistics (observed genotypes only):
cat(sprintf("  Dimensions: %d x %d\n", nrow(combined_grm_raw), ncol(combined_grm_raw)))
##   Dimensions: 860 x 860
cat(sprintf("  Mean diagonal: %.3f\n", mean(diag(combined_grm_raw))))
##   Mean diagonal: 1.000
cat(sprintf("  Mean off-diagonal: %.3f\n", mean(combined_grm_raw[upper.tri(combined_grm_raw)])))
##   Mean off-diagonal: -0.001
cat(sprintf("  SD off-diagonal: %.3f\n", sd(combined_grm_raw[upper.tri(combined_grm_raw)])))
##   SD off-diagonal: 0.220

Evaluate Recovery Quality

Compare the CovCombR combined GRM to the true population GRM.

# Create comparison visualization
true_grm_plot <- plot_aligned_grm(
  true_grm,
  title = "True Population GRM",
  ordered_ids = ordered_genotype_ids,
  text_size = 10
)

combined_grm_plot <- plot_aligned_grm(
  combined_grm,
  title = "CovCombR Combined GRM",
  ordered_ids = ordered_genotype_ids,
  text_size = 10
)

# Calculate residuals (only for observed genotypes)
residual_grm <- combined_grm - true_grm

residual_plot <- ggplot(data.frame(
  Row = rep(1:n_total, n_total),
  Col = rep(1:n_total, each = n_total),
  Residual = as.vector(residual_grm[ordered_genotype_ids, ordered_genotype_ids])
), aes(x = Col, y = n_total - Row + 1, fill = Residual)) +
  geom_tile() +
  scale_fill_gradient2(
    low = "#2166AC",
    mid = "white",
    high = "#B2182B",
    midpoint = 0,
    limits = c(-0.5, 0.5),
    name = "Residual",
    na.value = "#CCCCCC"
  ) +
  labs(title = "Residuals (Combined - True)", x = NULL, y = NULL) +
  theme_minimal(base_size = 10) +
  theme(
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    panel.grid = element_blank(),
    plot.title = element_text(hjust = 0.5, face = "bold"),
    legend.position = "right"
  ) +
  coord_equal()

# Scatter plot: True vs Combined with categorical coloring
# First, determine which pairs were observed in at least one study
observed_pairs <- matrix(FALSE, n_total, n_total)
rownames(observed_pairs) <- genotype_ids
colnames(observed_pairs) <- genotype_ids

for (k in 1:n_studies) {
  study_idx <- study_genotypes[[k]]
  study_ids <- genotype_ids[study_idx]
  observed_pairs[study_ids, study_ids] <- TRUE
}

# Create indices for classification
upper_tri_with_diag <- upper.tri(true_grm, diag = TRUE)
diag_idx <- matrix(FALSE, n_total, n_total)
diag(diag_idx) <- TRUE

# Classify each pair
pair_type <- rep(NA_character_, sum(upper_tri_with_diag))
true_vals <- true_grm[upper_tri_with_diag]
combined_vals <- combined_grm[upper_tri_with_diag]
diag_vals <- diag_idx[upper_tri_with_diag]
observed_vals <- observed_pairs[upper_tri_with_diag]

# Classify: diagonal, observed off-diagonal, or unobserved off-diagonal
pair_type[diag_vals] <- "Diagonal"
pair_type[!diag_vals & observed_vals] <- "Observed (off-diagonal)"
pair_type[!diag_vals & !observed_vals] <- "Unobserved (imputed)"

scatter_data <- data.frame(
  True = true_vals,
  Combined = combined_vals,
  Type = factor(pair_type, levels = c("Diagonal", "Observed (off-diagonal)", "Unobserved (imputed)"))
)

# Remove NA values (should only be unobserved genotypes)
scatter_data <- scatter_data[!is.na(scatter_data$Combined), ]

# Print counts for each category
cat("\nScatter plot point counts:\n")
## 
## Scatter plot point counts:
cat(sprintf("  Diagonal: %d\n", sum(scatter_data$Type == "Diagonal", na.rm = TRUE)))
##   Diagonal: 860
cat(sprintf("  Observed (off-diagonal): %d\n", sum(scatter_data$Type == "Observed (off-diagonal)", na.rm = TRUE)))
##   Observed (off-diagonal): 294045
cat(sprintf("  Unobserved (imputed): %d\n", sum(scatter_data$Type == "Unobserved (imputed)", na.rm = TRUE)))
##   Unobserved (imputed): 75325
# Define colors for each category
type_colors <- c(
  "Diagonal" = "#E31A1C", # Red for diagonal
  "Observed (off-diagonal)" = "#1F78B4", # Blue for observed
  "Unobserved (imputed)" = "#33A02C" # Green for imputed
)

scatter_plot <- ggplot(scatter_data, aes(x = True, y = Combined, color = Type, alpha = Type)) +
  geom_point(size = 1.5) +
  geom_abline(slope = 1, intercept = 0, color = "black", linetype = "dashed", linewidth = 0.8) +
  scale_color_manual(
    values = type_colors,
    name = "Pair Type",
    drop = FALSE
  ) +
  scale_alpha_manual(
    values = c("Diagonal" = 0.8, "Observed (off-diagonal)" = 0.6, "Unobserved (imputed)" = 0.4),
    name = "Pair Type",
    drop = FALSE
  ) +
  labs(
    title = "True vs Combined GRM",
    x = "True Relationship",
    y = "CovCombR Combined Relationship"
  ) +
  theme_minimal(base_size = 10) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    legend.position = "bottom",
    legend.box.spacing = unit(0.1, "cm")
  )

# Arrange plots (2x2 grid)
grid.arrange(
  true_grm_plot, combined_grm_plot,
  residual_plot, scatter_plot,
  ncol = 2, nrow = 2
)

Heatmaps comparing the true GRM, CovCombR combined GRM, and residual differences across genotypes.

Recovery Statistics

# Calculate recovery statistics
upper_tri_idx <- upper.tri(true_grm, diag = TRUE)

# Overall statistics (only for genotypes present in combined GRM)
valid_idx <- upper_tri_idx & !is.na(combined_grm)
cat("Overall Recovery Statistics (genotypes in combined GRM):\n")
## Overall Recovery Statistics (genotypes in combined GRM):
cat(sprintf(
  "  N pairs: %d (%.1f%% of total)\n",
  sum(valid_idx),
  100 * sum(valid_idx) / sum(upper_tri_idx)
))
##   N pairs: 370230 (74.0% of total)
cat(sprintf(
  "  RMSE: %.4f\n",
  sqrt(mean((combined_grm[valid_idx] - true_grm[valid_idx])^2, na.rm = TRUE))
))
##   RMSE: 0.0507
cat(sprintf(
  "  MAE: %.4f\n",
  mean(abs(combined_grm[valid_idx] - true_grm[valid_idx]), na.rm = TRUE)
))
##   MAE: 0.0355
cat(sprintf(
  "  Correlation: %.4f\n",
  cor(combined_grm[valid_idx], true_grm[valid_idx], use = "complete.obs")
))
##   Correlation: 0.9792
# Separate statistics for observed vs unobserved pairs
# Note: observed_pairs was already computed in the evaluate-recovery chunk
observed_idx <- observed_pairs & upper_tri_idx
unobserved_idx <- (!observed_pairs) & upper_tri_idx

# Additionally filter out NA values (genotypes not in any study)
# combined_grm may have NA for genotypes never observed
observed_valid <- observed_idx & !is.na(combined_grm)
unobserved_valid <- unobserved_idx & !is.na(combined_grm)

if (sum(observed_valid) > 0) {
  cat("\nObserved Pairs (in at least one study):\n")
  cat(sprintf(
    "  N pairs: %d (%.1f%%)\n",
    sum(observed_valid),
    100 * sum(observed_valid) / sum(upper_tri_idx)
  ))
  cat(sprintf(
    "  RMSE: %.4f\n",
    sqrt(mean((combined_grm[observed_valid] - true_grm[observed_valid])^2, na.rm = TRUE))
  ))
  cat(sprintf(
    "  MAE: %.4f\n",
    mean(abs(combined_grm[observed_valid] - true_grm[observed_valid]), na.rm = TRUE)
  ))
  cat(sprintf(
    "  Correlation: %.4f\n",
    cor(combined_grm[observed_valid], true_grm[observed_valid], use = "complete.obs")
  ))
}
## 
## Observed Pairs (in at least one study):
##   N pairs: 294905 (58.9%)
##   RMSE: 0.0335
##   MAE: 0.0251
##   Correlation: 0.9906
if (sum(unobserved_valid) > 0) {
  cat("\nUnobserved Pairs (never jointly observed, but both genotypes in studies):\n")
  cat(sprintf(
    "  N pairs: %d (%.1f%%)\n",
    sum(unobserved_valid),
    100 * sum(unobserved_valid) / sum(upper_tri_idx)
  ))
  cat(sprintf(
    "  RMSE: %.4f\n",
    sqrt(mean((combined_grm[unobserved_valid] - true_grm[unobserved_valid])^2, na.rm = TRUE))
  ))
  cat(sprintf(
    "  MAE: %.4f\n",
    mean(abs(combined_grm[unobserved_valid] - true_grm[unobserved_valid]), na.rm = TRUE)
  ))
  cat(sprintf(
    "  Correlation: %.4f\n",
    cor(combined_grm[unobserved_valid], true_grm[unobserved_valid], use = "complete.obs")
  ))
}
## 
## Unobserved Pairs (never jointly observed, but both genotypes in studies):
##   N pairs: 75325 (15.0%)
##   RMSE: 0.0907
##   MAE: 0.0763
##   Correlation: 0.9638
# Statistics by subpopulation blocks
cat("\nRecovery by Population Structure Blocks:\n")
## 
## Recovery by Population Structure Blocks:
for (pop1 in 1:3) {
  for (pop2 in pop1:3) {
    idx1 <- which(subpop_labels == pop1)
    idx2 <- which(subpop_labels == pop2)

    block_mask <- matrix(FALSE, n_total, n_total)
    block_mask[idx1, idx2] <- TRUE
    block_mask[idx2, idx1] <- TRUE

    block_idx <- block_mask & upper_tri_idx
    block_valid <- block_idx & !is.na(combined_grm)

    if (sum(block_valid) > 0) {
      block_rmse <- sqrt(mean((combined_grm[block_valid] - true_grm[block_valid])^2, na.rm = TRUE))
      block_cor <- cor(combined_grm[block_valid], true_grm[block_valid], use = "complete.obs")

      cat(sprintf(
        "  Pop%d-Pop%d: N=%d, RMSE=%.4f, Cor=%.4f\n",
        pop1, pop2, sum(block_valid), block_rmse, block_cor
      ))
    }
  }
}
##   Pop1-Pop1: N=57291, RMSE=0.0516, Cor=0.7445
##   Pop1-Pop2: N=100386, RMSE=0.0549, Cor=0.2390
##   Pop1-Pop3: N=76050, RMSE=0.0385, Cor=0.2105
##   Pop2-Pop2: N=44253, RMSE=0.0630, Cor=0.7217
##   Pop2-Pop3: N=66825, RMSE=0.0292, Cor=0.3486
##   Pop3-Pop3: N=25425, RMSE=0.0770, Cor=0.7469

Session Information

sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: aarch64-apple-darwin20
## Running under: macOS 15.7.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] gridExtra_2.3  tidyr_1.3.1    tibble_3.3.0   dplyr_1.1.4    ggplot2_4.0.0 
## [6] CovCombR_1.5.0
## 
## loaded via a namespace (and not attached):
##  [1] vctrs_0.6.5        cli_3.6.5          knitr_1.50         rlang_1.1.6       
##  [5] xfun_0.54          purrr_1.1.0        generics_0.1.4     S7_0.2.0          
##  [9] jsonlite_2.0.0     labeling_0.4.3     glue_1.8.0         htmltools_0.5.8.1 
## [13] tinytex_0.57       sass_0.4.10        scales_1.4.0       rmarkdown_2.30    
## [17] grid_4.4.1         evaluate_1.0.5     jquerylib_0.1.4    fastmap_1.2.0     
## [21] yaml_2.3.10        lifecycle_1.0.4    compiler_4.4.1     RColorBrewer_1.1-3
## [25] pkgconfig_2.0.3    farver_2.1.2       digest_0.6.37      R6_2.6.1          
## [29] tidyselect_1.2.1   pillar_1.11.1      magrittr_2.0.4     bslib_0.9.0       
## [33] withr_3.0.2        tools_4.4.1        gtable_0.3.6       cachem_1.1.0